library(corrplot)
## corrplot 0.95 loaded
library(lubridate)
## 
## Attachement du package : 'lubridate'
## Les objets suivants sont masqués depuis 'package:base':
## 
##     date, intersect, setdiff, union
library(zoo)
## 
## Attachement du package : 'zoo'
## Les objets suivants sont masqués depuis 'package:base':
## 
##     as.Date, as.Date.numeric
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr   1.1.4     ✔ readr   2.1.6
## ✔ forcats 1.0.1     ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1     ✔ tibble  3.3.0
## ✔ purrr   1.2.1     ✔ tidyr   1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(survival)
library(asaur)
library(broom)
library(survminer)
## Le chargement a nécessité le package : ggpubr
## 
## Attachement du package : 'survminer'
## 
## L'objet suivant est masqué depuis 'package:survival':
## 
##     myeloma
library(ggfortify)
library(gtsummary)
library(ggplot2)
library(GGally)
library(ggsurvfit)

Get the Data

data_orig = read.csv('../../database_1perDay_isWarm8.csv', sep=',', header=T)

Dataset

cat("--- Dimension of the dataset: ", dim(data_orig), "\n")
## --- Dimension of the dataset:  707 15
cat("--- Number of missing values: ", sum(is.na(data_orig)), "\n") # sum=0 -> no missing values
## --- Number of missing values:  276
head(data_orig)
##   id start stop event isWarm latitude longitude temperature windSpeed pressure
## 1  1     0   88     1      0     47.5      6.95        15.6         7    97510
## 2  2     0   79     0      1     47.5      6.95        13.9         5    98170
## 3  2    79   89     1      1     47.5      6.95        19.8         2    99260
## 4  3     0   83     1      0     47.5      6.95        10.0         3    96790
## 5  4     0   83     1      0     47.5      6.95        16.1         2    99110
## 6  5     0  163     0      1     47.5      6.95        26.7         5    98860
##   humidity visibility nebulosity cloudHeight moonPhase
## 1       52      40000         75        1250        72
## 2       58      40000         90         800        44
## 3       42      50000         90        1750        17
## 4       53      20000         10        1250        62
## 5       53       7000         10          NA        77
## 6       49      35000         90        1750         4
tbl_summary(data_orig)
Characteristic N = 7071
id 251 (119, 377)
start 0 (0, 157)
stop 157 (116, 185)
event 409 (58%)
isWarm 596 (84%)
latitude
    47.5 79 (11%)
    47.69 64 (9.1%)
    47.88 83 (12%)
    48.06 103 (15%)
    48.25 67 (9.5%)
    48.44 58 (8.2%)
    48.62 82 (12%)
    48.81 98 (14%)
    49 73 (10%)
longitude
    6.95 72 (10%)
    7.15 121 (17%)
    7.35 157 (22%)
    7.55 126 (18%)
    7.75 113 (16%)
    7.95 56 (7.9%)
    8.15 62 (8.8%)
temperature 21 (16, 25)
windSpeed 3.00 (2.00, 4.00)
pressure 98,960 (98,400, 99,770)
humidity 52 (42, 64)
visibility 25,000 (15,050, 40,000)
    Unknown 2
nebulosity 90 (60, 90)
    Unknown 152
cloudHeight 1,250 (800, 1,750)
    Unknown 110
moonPhase 56 (15, 86)
    Unknown 12
1 Median (Q1, Q3); n (%)
data_clean <- data_orig

# Add a season variable to allow the risk to vary with seasons:
data_clean$season <- factor(quarters(as.Date(data_orig$stop)))

barplot(prop.table(table(data_clean$season)))

Try to fix the outliers

The ciconia can fly up to 4 800m high so limit the value of cloudHeight to 3 000

data_clean$cloudHeight[data_clean$cloudHeight>2000] <- 2000

par(mfrow=c(2,2), mai = c(0.6, 0.6, 0.1, 0.1))
hist(data_orig$cloudHeight, breaks=20)
hist(data_clean$cloudHeight, breaks=10)

The ciconia has difficulties flying with winds higher than 3-5m/s

data_clean$windSpeed[data_clean$windSpeed>5] <- 5

par(mfrow=c(2,2), mai = c(0.6, 0.6, 0.1, 0.1))
hist(data_orig$windSpeed)
hist(data_clean$windSpeed)

Remove data with observation time after day 212 (1 august) as they are usually observed much earlier. They are bad measurements ?

ids <- unique(data_clean$id[data_clean$stop>212])

ids
##  [1]  31  45  46  51 115 139 144 149 151 155 166 174 191 229 230 246 271 287 289
## [20] 292 320 337 360 368 387 403 407 420 428 429 458
data_clean <- data_clean[! data_clean$id %in% ids,]

par(mfrow=c(2,2), mai = c(0.6, 0.6, 0.1, 0.1))
plot(data_orig$stop, data_orig$temperature)
hist(data_orig$stop, breaks=30)

plot(data_clean$stop, data_clean$temperature)
hist(data_clean$stop, breaks=30)

Normalization of the dataset

# Normalize the non-boolean data for all attributes to have mean=0 and std=1.
# with or without normalizing binary data does not affect the result !!
data <- data_clean

data[c(6:15)] <- lapply(data[c(6:15)], function(x) c(scale(x)))

Replace missing values

data[is.na(data)] <- 0

summary(data)
##        id            start             stop           event      
##  Min.   :  1.0   Min.   : -7.00   Min.   : -7.0   Min.   :0.000  
##  1st Qu.:116.0   1st Qu.:  0.00   1st Qu.:114.0   1st Qu.:0.000  
##  Median :252.0   Median :  0.00   Median :152.0   Median :1.000  
##  Mean   :250.2   Mean   : 63.55   Mean   :140.6   Mean   :0.586  
##  3rd Qu.:378.0   3rd Qu.:152.00   3rd Qu.:173.0   3rd Qu.:1.000  
##  Max.   :503.0   Max.   :211.00   Max.   :212.0   Max.   :1.000  
##      isWarm          latitude          longitude        temperature     
##  Min.   :0.0000   Min.   :-1.55631   Min.   :-1.5585   Min.   :-3.4443  
##  1st Qu.:1.0000   1st Qu.:-0.77687   1st Qu.:-0.9811   1st Qu.:-0.6353  
##  Median :1.0000   Median :-0.01794   Median : 0.1737   Median : 0.0779  
##  Mean   :0.8279   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.:1.0000   3rd Qu.: 0.74099   3rd Qu.: 0.7511   3rd Qu.: 0.7620  
##  Max.   :1.0000   Max.   : 1.52044   Max.   : 1.9059   Max.   : 2.3047  
##    windSpeed          pressure           humidity         visibility     
##  Min.   :-2.1648   Min.   :-3.26051   Min.   :-2.0011   Min.   :-1.8175  
##  1st Qu.:-0.7458   1st Qu.:-0.74509   1st Qu.:-0.7689   1st Qu.:-0.8595  
##  Median :-0.0363   Median :-0.08085   Median :-0.1528   Median :-0.1905  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6732   3rd Qu.: 0.76851   3rd Qu.: 0.5865   3rd Qu.: 0.8131  
##  Max.   : 1.3827   Max.   : 2.66324   Max.   : 2.8044   Max.   : 3.1547  
##    nebulosity        cloudHeight         moonPhase       season  
##  Min.   :-2.50903   Min.   :-2.39169   Min.   :-1.4763   Q1: 96  
##  1st Qu.:-0.42996   1st Qu.:-0.91927   1st Qu.:-1.0491   Q2:416  
##  Median : 0.08981   Median :-0.05877   Median : 0.0899   Q3:131  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.0000   Q4:  2  
##  3rd Qu.: 0.60957   3rd Qu.: 0.89735   3rd Qu.: 0.9442           
##  Max.   : 0.99074   Max.   : 1.37541   Max.   : 1.3713
head(data)
##   id start stop event isWarm  latitude longitude  temperature  windSpeed
## 1  1     0   88     1      0 -1.556309  -1.55854 -0.620721914  1.3827075
## 2  2     0   79     0      1 -1.556309  -1.55854 -0.868148691  1.3827075
## 3  2    79   89     1      1 -1.556309  -1.55854 -0.009432229 -0.7458041
## 4  3     0   83     1      0 -1.556309  -1.55854 -1.435774827 -0.0363002
## 5  4     0   83     1      0 -1.556309  -1.55854 -0.547949332 -0.7458041
## 6  5     0  163     0      1 -1.556309  -1.55854  0.994829396  1.3827075
##      pressure    humidity visibility  nebulosity cloudHeight  moonPhase season
## 1 -1.70334930 -0.15282675  0.8130947  0.08980589 -0.05876531  0.5739934     Q1
## 2 -0.98465860  0.21682295  0.8130947  0.60957289 -0.91926988 -0.2233388     Q1
## 3  0.20226999 -0.76890959  1.4821295  0.60957289  0.89735087 -0.9921949     Q1
## 4 -2.48737552 -0.09121847 -0.5249748 -2.16251774 -0.05876531  0.2892319     Q1
## 5  0.03893119 -0.09121847 -1.3947201 -2.16251774  0.00000000  0.7163741     Q1
## 6 -0.23330014 -0.33765160  0.4785773  0.60957289  0.89735087 -1.3623849     Q2

Remove Outliers

## See linear variation of temperature and time:
plot(data$stop, data$temperature, lwd=2)

plot(data$stop, data$temperature/data$stop)
abline(h=0.05, lty=2)
abline(h=-0.2, lty=2)

# remove outliers:
data <- data[data$temperature/data$stop <= 0.05,]
data <- data[data$temperature/data$stop >= -0.2,]

Some plots

old.par = par(mar = c(6, 5, 2, 2))
par(mfrow=c(3,2), mai = c(0.6, 0.6, 0.1, 0.1))
plot(data$stop, data$temperature, lwd=2, xlab="Days to event", ylab="Temperature [°C]")
boxplot(data$temperature, ylab="Temperature [°C]")

plot(data$stop, data$windSpeed  , lwd=2, xlab="Days to event", ylab="Wind Speed  [m/s]")
boxplot(data$windSpeed, ylab="Wind Speed [m/s]")

plot(data$stop, data$pressure   , lwd=2, xlab="Days to event", ylab="Pressure [Pa]")
boxplot(data$pressure, ylab="Pressure [Pa]")

par(mfrow=c(3,2), mai = c(0.6, 0.6, 0.1, 0.1))
plot(data$stop, data$humidity   , lwd=2, xlab="Days to event", ylab="Humidity [%]")
boxplot(data$humidity, ylab="Humidity [%]")

plot(data$stop, data$visibility   , lwd=2, xlab="Days to event", ylab="Visibility [%]")
boxplot(data$visibility, ylab="Visibility [%]")

plot(data$stop, data$nebulosity   , lwd=2, xlab="Days to event", ylab="Nebulosiity [%]")
boxplot(data$nebulosity, ylab="Nebulosity [%]")

par(mfrow=c(3,2), mai = c(0.6, 0.6, 0.1, 0.1))
plot(data$stop, data$cloudHeight   , lwd=2, xlab="Days to event", ylab="Cloud Height [m]")
boxplot(data$cloudHeight, ylab="Cloud Height [m]")

plot(data$stop, data$moonPhase   , lwd=2, xlab="Days to event", ylab="Moon Phase [%]")
boxplot(data$moonPhase, ylab="Moon Phase [%]")

hist(data$isWarm)
hist(data$event)

ggplot(stack(data[, !names(data) %in% c("stop", "start", "id", "event", "season", "isWarm")]), aes(x=ind, y=values)) + geom_boxplot() + coord_flip()

old.par = par(mar = c(6, 5, 2, 2))
par(mfrow=c(3,3), mai = c(0.6, 0.6, 0.1, 0.1))
plot(data$stop, data$temperature, lwd=2, xlab="Days to event", ylab="Temperature [°C]")
plot(data$stop, data$windSpeed  , lwd=2, xlab="Days to event", ylab="Wind Speed  [m/s]")
plot(data$stop, data$pressure   , lwd=2, xlab="Days to event", ylab="Pressure [Pa]")
plot(data$stop, data$humidity   , lwd=2, xlab="Days to event", ylab="Humidity [%]")
plot(data$stop, data$visibility   , lwd=2, xlab="Days to event", ylab="Visibility [%]")
plot(data$stop, data$nebulosity   , lwd=2, xlab="Days to event", ylab="Nebulosiity [%]")
plot(data$stop, data$cloudHeight   , lwd=2, xlab="Days to event", ylab="Cloud Height [m]")
plot(data$stop, data$moonPhase   , lwd=2, xlab="Days to event", ylab="Moon Phase [%]")

Correlation matrix

par(mfrow=c(1,1))
corrplot(cor(data[1:15]), tl.col="black", main="Correlation Matrix")

SURVIVAL ANALYSIS

COX

Mnone  <- coxph(Surv(start, stop, event)~1          , data=data)
Mtemp  <- coxph(Surv(start, stop, event)~temperature, data=data)
Mwarm  <- coxph(Surv(start, stop, event)~isWarm     , data=data)
Mwind  <- coxph(Surv(start, stop, event)~windSpeed  , data=data)
Mcloud <- coxph(Surv(start, stop, event)~cloudHeight, data=data)

cat("--- Mtemp ---\n\n")
## --- Mtemp ---
summary(Mtemp)
## Call:
## coxph(formula = Surv(start, stop, event) ~ temperature, data = data)
## 
##   n= 641, number of events= 376 
## 
##                 coef exp(coef) se(coef)      z Pr(>|z|)    
## temperature -0.90967   0.40266  0.06867 -13.25   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##             exp(coef) exp(-coef) lower .95 upper .95
## temperature    0.4027      2.484     0.352    0.4607
## 
## Concordance= 0.751  (se = 0.016 )
## Likelihood ratio test= 179.9  on 1 df,   p=<2e-16
## Wald test            = 175.5  on 1 df,   p=<2e-16
## Score (logrank) test = 182.3  on 1 df,   p=<2e-16
# p-value < 0.01 --> reject H0. Difference between Mnone and Mtemp is higly significant -> should keep temperature

cat("--- Mwarm ---\n\n")
## --- Mwarm ---
summary(Mwarm)
## Call:
## coxph(formula = Surv(start, stop, event) ~ isWarm, data = data)
## 
##   n= 641, number of events= 376 
## 
##           coef exp(coef) se(coef)      z Pr(>|z|)    
## isWarm -3.2649    0.0382   0.1973 -16.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        exp(coef) exp(-coef) lower .95 upper .95
## isWarm    0.0382      26.18   0.02595   0.05624
## 
## Concordance= 0.695  (se = 0.012 )
## Likelihood ratio test= 304.4  on 1 df,   p=<2e-16
## Wald test            = 273.8  on 1 df,   p=<2e-16
## Score (logrank) test = 463.4  on 1 df,   p=<2e-16
# p-value  < 0.01 --> reject H0. Difference between Mnone and Mwarm is highly significant -> should keep isWarm

cat("--- Mwind ---\n\n")
## --- Mwind ---
summary(Mwind)
## Call:
## coxph(formula = Surv(start, stop, event) ~ windSpeed, data = data)
## 
##   n= 641, number of events= 376 
## 
##               coef exp(coef) se(coef)     z Pr(>|z|)  
## windSpeed -0.11485   0.89150  0.05268 -2.18   0.0293 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##           exp(coef) exp(-coef) lower .95 upper .95
## windSpeed    0.8915      1.122     0.804    0.9885
## 
## Concordance= 0.544  (se = 0.022 )
## Likelihood ratio test= 4.78  on 1 df,   p=0.03
## Wald test            = 4.75  on 1 df,   p=0.03
## Score (logrank) test = 4.77  on 1 df,   p=0.03
# p-value < 0.05 --> reject H0. Difference between Mnone and Mwind is significant -> should keep windSpeed

cat("--- Mcloud ---\n\n")
## --- Mcloud ---
summary(Mcloud)
## Call:
## coxph(formula = Surv(start, stop, event) ~ cloudHeight, data = data)
## 
##   n= 641, number of events= 376 
## 
##                coef exp(coef) se(coef)     z Pr(>|z|)  
## cloudHeight 0.11946   1.12689  0.06012 1.987   0.0469 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##             exp(coef) exp(-coef) lower .95 upper .95
## cloudHeight     1.127     0.8874     1.002     1.268
## 
## Concordance= 0.53  (se = 0.023 )
## Likelihood ratio test= 3.98  on 1 df,   p=0.05
## Wald test            = 3.95  on 1 df,   p=0.05
## Score (logrank) test = 3.95  on 1 df,   p=0.05
# p-value < 0.05 --> reject H0. Difference between Mnone and Mcloud is significant -> should keep cloudHeight

Comparing nested models

ANOVA

# Regarding the correlation Matrix, Temperature, warmBefore are the most significant covariates
MtempWarm <- coxph(Surv(start, stop, event)~temperature + isWarm + strata(season)       , data=data)
MtempWarmWindCloud <- coxph(Surv(start, stop, event)~temperature + isWarm + strata(season) + windSpeed + cloudHeight       , data=data)
Mall      <- coxph(Surv(start, stop, event)~latitude + longitude + isWarm + temperature + windSpeed + pressure + humidity + visibility + nebulosity + cloudHeight + moonPhase + strata(season), data=data)

anova(MtempWarm, Mall)
## Analysis of Deviance Table
##  Cox model: response is  Surv(start, stop, event)
##  Model 1: ~ temperature + isWarm + strata(season)
##  Model 2: ~ latitude + longitude + isWarm + temperature + windSpeed + pressure + humidity + visibility + nebulosity + cloudHeight + moonPhase + strata(season)
##    loglik  Chisq Df Pr(>|Chi|)    
## 1 -1470.7                         
## 2 -1435.0 71.436  9  7.959e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p-value < 0.05 --> reject H0. Difference between the models is highly significant -> should keep more covariates than just Temperature and isWarm

anova(MtempWarmWindCloud, Mall)
## Analysis of Deviance Table
##  Cox model: response is  Surv(start, stop, event)
##  Model 1: ~ temperature + isWarm + strata(season) + windSpeed + cloudHeight
##  Model 2: ~ latitude + longitude + isWarm + temperature + windSpeed + pressure + humidity + visibility + nebulosity + cloudHeight + moonPhase + strata(season)
##    loglik  Chisq Df Pr(>|Chi|)    
## 1 -1450.7                         
## 2 -1435.0 31.517  7  4.989e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p-value = 0.01257, < 0.05 --> reject H0. Difference between the models is significant -> more covariates are necessary to explain the data
# Model selection using all covariates:
MAIC <- step(Mall)
## Start:  AIC=2891.96
## Surv(start, stop, event) ~ latitude + longitude + isWarm + temperature + 
##     windSpeed + pressure + humidity + visibility + nebulosity + 
##     cloudHeight + moonPhase + strata(season)
## 
## 
## Step:  AIC=3270.68
## Surv(start, stop, event) ~ latitude + longitude + isWarm + temperature + 
##     windSpeed + pressure + humidity + visibility + nebulosity + 
##     cloudHeight + moonPhase
summary(MAIC)
## Call:
## coxph(formula = Surv(start, stop, event) ~ latitude + longitude + 
##     isWarm + temperature + windSpeed + pressure + humidity + 
##     visibility + nebulosity + cloudHeight + moonPhase, data = data)
## 
##   n= 641, number of events= 376 
## 
##                 coef exp(coef) se(coef)       z Pr(>|z|)    
## latitude     0.04566   1.04671  0.08351   0.547 0.584588    
## longitude    0.16842   1.18343  0.06528   2.580 0.009880 ** 
## isWarm      -2.07290   0.12582  0.23488  -8.825  < 2e-16 ***
## temperature -1.18656   0.30527  0.09937 -11.941  < 2e-16 ***
## windSpeed   -0.19953   0.81912  0.05703  -3.499 0.000467 ***
## pressure    -0.14994   0.86076  0.07606  -1.971 0.048687 *  
## humidity    -0.40031   0.67011  0.09945  -4.025 5.69e-05 ***
## visibility  -0.22837   0.79583  0.06243  -3.658 0.000254 ***
## nebulosity  -0.10825   0.89740  0.06749  -1.604 0.108716    
## cloudHeight  0.22859   1.25683  0.08634   2.648 0.008106 ** 
## moonPhase    0.04129   1.04216  0.05335   0.774 0.438883    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##             exp(coef) exp(-coef) lower .95 upper .95
## latitude       1.0467     0.9554    0.8887    1.2329
## longitude      1.1834     0.8450    1.0413    1.3450
## isWarm         0.1258     7.9479    0.0794    0.1994
## temperature    0.3053     3.2758    0.2512    0.3709
## windSpeed      0.8191     1.2208    0.7325    0.9160
## pressure       0.8608     1.1618    0.7416    0.9991
## humidity       0.6701     1.4923    0.5514    0.8143
## visibility     0.7958     1.2566    0.7042    0.8994
## nebulosity     0.8974     1.1143    0.7862    1.0243
## cloudHeight    1.2568     0.7957    1.0612    1.4886
## moonPhase      1.0422     0.9595    0.9387    1.1570
## 
## Concordance= 0.829  (se = 0.012 )
## Likelihood ratio test= 466.1  on 11 df,   p=<2e-16
## Wald test            = 408.9  on 11 df,   p=<2e-16
## Score (logrank) test = 619.1  on 11 df,   p=<2e-16
# MoonPhase, nebulosity, pressure and latitude : not necessary at all
# 11 degrees of freedom
Mbest <- coxph(Surv(start, stop, event)~longitude + latitude + pressure + temperature + isWarm + windSpeed + humidity + visibility + cloudHeight + strata(season), data=data)
summary(Mbest)
## Call:
## coxph(formula = Surv(start, stop, event) ~ longitude + latitude + 
##     pressure + temperature + isWarm + windSpeed + humidity + 
##     visibility + cloudHeight + strata(season), data = data)
## 
##   n= 641, number of events= 376 
## 
##                 coef exp(coef) se(coef)      z Pr(>|z|)    
## longitude    0.11459   1.12142  0.06839  1.676 0.093812 .  
## latitude     0.20656   1.22944  0.08176  2.526 0.011522 *  
## pressure    -0.19490   0.82292  0.07478 -2.606 0.009148 ** 
## temperature -0.85406   0.42568  0.10255 -8.328  < 2e-16 ***
## isWarm      -1.63839   0.19429  0.25785 -6.354  2.1e-10 ***
## windSpeed   -0.20731   0.81277  0.05590 -3.709 0.000208 ***
## humidity    -0.30283   0.73873  0.08680 -3.489 0.000485 ***
## visibility  -0.06500   0.93706  0.06053 -1.074 0.282829    
## cloudHeight  0.23976   1.27094  0.08047  2.980 0.002886 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##             exp(coef) exp(-coef) lower .95 upper .95
## longitude      1.1214     0.8917    0.9807    1.2823
## latitude       1.2294     0.8134    1.0474    1.4431
## pressure       0.8229     1.2152    0.7107    0.9528
## temperature    0.4257     2.3492    0.3482    0.5205
## isWarm         0.1943     5.1469    0.1172    0.3221
## windSpeed      0.8128     1.2304    0.7284    0.9069
## humidity       0.7387     1.3537    0.6232    0.8757
## visibility     0.9371     1.0672    0.8322    1.0551
## cloudHeight    1.2709     0.7868    1.0855    1.4881
## 
## Concordance= 0.755  (se = 0.018 )
## Likelihood ratio test= 198.6  on 9 df,   p=<2e-16
## Wald test            = 199  on 9 df,   p=<2e-16
## Score (logrank) test = 196.1  on 9 df,   p=<2e-16
tab1 <- tidy(Mbest, exponentiate=T, conf.int=T) # conf.int : confident interval
# p-values are < 0.05 so we accept these covariates, there are statistically significant
tab1
## # A tibble: 9 × 7
##   term        estimate std.error statistic  p.value conf.low conf.high
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
## 1 longitude      1.12     0.0684      1.68 9.38e- 2    0.981     1.28 
## 2 latitude       1.23     0.0818      2.53 1.15e- 2    1.05      1.44 
## 3 pressure       0.823    0.0748     -2.61 9.15e- 3    0.711     0.953
## 4 temperature    0.426    0.103      -8.33 8.23e-17    0.348     0.520
## 5 isWarm         0.194    0.258      -6.35 2.10e-10    0.117     0.322
## 6 windSpeed      0.813    0.0559     -3.71 2.08e- 4    0.728     0.907
## 7 humidity       0.739    0.0868     -3.49 4.85e- 4    0.623     0.876
## 8 visibility     0.937    0.0605     -1.07 2.83e- 1    0.832     1.06 
## 9 cloudHeight    1.27     0.0805      2.98 2.89e- 3    1.09      1.49
Mbest %>% survfit2() %>% ggsurvfit() + add_confidence_interval() + add_risktable() + scale_y_continuous(limits = c(0, 1))

anova(Mall, Mbest)
## Analysis of Deviance Table
##  Cox model: response is  Surv(start, stop, event)
##  Model 1: ~ latitude + longitude + isWarm + temperature + windSpeed + pressure + humidity + visibility + nebulosity + cloudHeight + moonPhase + strata(season)
##  Model 2: ~ longitude + latitude + pressure + temperature + isWarm + windSpeed + humidity + visibility + cloudHeight + strata(season)
##    loglik  Chisq Df Pr(>|Chi|)  
## 1 -1435.0                       
## 2 -1437.9 5.8174  2    0.05455 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p>0.05 -> reject H1, both models are significantly equals -> keep only the covariates from Mbest is sufficient.
# Without strata(season)
Mall_noSeason  <- coxph(Surv(start, stop, event)~longitude + latitude + pressure + temperature + isWarm + windSpeed + humidity + visibility + cloudHeight + nebulosity + moonPhase, data=data)
Mbest_noSeason <- coxph(Surv(start, stop, event)~longitude  + pressure + temperature + isWarm + windSpeed + humidity + visibility + cloudHeight , data=data)

anova(Mall_noSeason, Mbest_noSeason)
## Analysis of Deviance Table
##  Cox model: response is  Surv(start, stop, event)
##  Model 1: ~ longitude + latitude + pressure + temperature + isWarm + windSpeed + humidity + visibility + cloudHeight + nebulosity + moonPhase
##  Model 2: ~ longitude + pressure + temperature + isWarm + windSpeed + humidity + visibility + cloudHeight
##    loglik  Chisq Df Pr(>|Chi|)
## 1 -1624.3                     
## 2 -1625.9 3.0518  3     0.3837
# p-value is even higher than when considering strata(season)... -> do not take strata(season)

Mbest_noSeason %>% survfit2() %>% ggsurvfit() + add_confidence_interval() + add_risktable() + scale_y_continuous(limits = c(0, 1)) 

Look at Hazard Ratio (HR)

ggforest(Mall_noSeason , main="Hazard ratios and 95% confidence intervals", data=data) # FIXME Not working with strata(season)

ggforest(Mbest_noSeason, main="Hazard ratios and 95% confidence intervals", data=data) # FIXME Not working with strata(season)

# Confidence intervals are relatively small -> good !
# line is for HR=1 ; 
# HR of isWarm is only <0.1 -> less than 10% of the hazard

ggcoef_model(Mbest_noSeason, exponentiate=TRUE)

ggcoef_model(Mall_noSeason , exponentiate=TRUE)

Time-dependency risks - Schoenfeld test

# Try to identify the covariates with time-varying effects
# test : H0 = HR for the covariate is constant over time 
cox.zph(Mbest)
##             chisq df       p
## longitude   18.55  1 1.7e-05
## latitude    24.64  1 6.9e-07
## pressure     9.55  1 0.00199
## temperature 13.98  1 0.00019
## isWarm       7.46  1 0.00629
## windSpeed    7.05  1 0.00793
## humidity    16.21  1 5.7e-05
## visibility   1.57  1 0.21018
## cloudHeight  5.49  1 0.01915
## GLOBAL      72.26  9 5.5e-12
par(mfrow=c(3,3), mai = c(0.6, 0.6, 0.1, 0.1))
plot(cox.zph(Mbest))

# p-values are all < 0.05 --> reject H0 -> time variations

cox.zph(Mbest_noSeason)
##               chisq df       p
## longitude   14.4625  1 0.00014
## pressure     7.3747  1 0.00661
## temperature 28.2379  1 1.1e-07
## isWarm       0.0815  1 0.77532
## windSpeed    4.2628  1 0.03895
## humidity    25.7813  1 3.8e-07
## visibility   1.0897  1 0.29653
## cloudHeight 11.0147  1 0.00090
## GLOBAL      75.4463  8 4.0e-13
plot(cox.zph(Mbest_noSeason))

# Looking at data, temperature seems to be linear dependent with time, try:
Mbest <- coxph(Surv(start, stop, event)~longitude + latitude + pressure + temperature + isWarm + windSpeed + humidity + visibility + cloudHeight + strata(season), data=data)

Mbest_tt1 <- coxph(Surv(start, stop, event)~longitude + tt(longitude) + latitude + tt(latitude) + pressure + tt(pressure) + temperature + tt(temperature) + isWarm + tt(isWarm) + windSpeed + tt(windSpeed) + humidity + tt(humidity) + visibility + cloudHeight + strata(season), data=data, tt=function(x,t,...) x/t)
Mbest_tt2 <- coxph(Surv(start, stop, event)~longitude + tt(longitude) + latitude + tt(latitude) + pressure + tt(pressure) + temperature + tt(temperature) + isWarm + tt(isWarm) + windSpeed + tt(windSpeed) + humidity + tt(humidity) + visibility + cloudHeight + strata(season) , data=data, tt=function(x,t,...) x*t)
Mbest_tt3 <- coxph(Surv(start, stop, event)~longitude + tt(longitude) + latitude + tt(latitude) + pressure + tt(pressure) + temperature + tt(temperature) + isWarm + tt(isWarm) + windSpeed + tt(windSpeed) + humidity + tt(humidity) + visibility + cloudHeight + strata(season), data=data, tt=function(x,t,...) x*log(t))

anova(Mbest_tt1, Mbest_tt2) # p-value < 0.01 -> highly significant difference between the two models
## Analysis of Deviance Table
##  Cox model: response is  Surv(start, stop, event)
##  Model 1: ~ longitude + tt(longitude) + latitude + tt(latitude) + pressure + tt(pressure) + temperature + tt(temperature) + isWarm + tt(isWarm) + windSpeed + tt(windSpeed) + humidity + tt(humidity) + visibility + cloudHeight + strata(season)
##  Model 2: ~ longitude + tt(longitude) + latitude + tt(latitude) + pressure + tt(pressure) + temperature + tt(temperature) + isWarm + tt(isWarm) + windSpeed + tt(windSpeed) + humidity + tt(humidity) + visibility + cloudHeight + strata(season)
##    loglik  Chisq Df Pr(>|Chi|)    
## 1 -1423.9                         
## 2 -1407.8 32.269  0  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(Mbest_tt1, Mbest_tt3) # p-value < 0.01 -> highly significant difference between the two models
## Analysis of Deviance Table
##  Cox model: response is  Surv(start, stop, event)
##  Model 1: ~ longitude + tt(longitude) + latitude + tt(latitude) + pressure + tt(pressure) + temperature + tt(temperature) + isWarm + tt(isWarm) + windSpeed + tt(windSpeed) + humidity + tt(humidity) + visibility + cloudHeight + strata(season)
##  Model 2: ~ longitude + tt(longitude) + latitude + tt(latitude) + pressure + tt(pressure) + temperature + tt(temperature) + isWarm + tt(isWarm) + windSpeed + tt(windSpeed) + humidity + tt(humidity) + visibility + cloudHeight + strata(season)
##    loglik  Chisq Df Pr(>|Chi|)    
## 1 -1423.9                         
## 2 -1411.6 24.493  0  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(Mbest_tt2, Mbest_tt3) # p-value < 0.01 -> highly significant difference between the two models
## Analysis of Deviance Table
##  Cox model: response is  Surv(start, stop, event)
##  Model 1: ~ longitude + tt(longitude) + latitude + tt(latitude) + pressure + tt(pressure) + temperature + tt(temperature) + isWarm + tt(isWarm) + windSpeed + tt(windSpeed) + humidity + tt(humidity) + visibility + cloudHeight + strata(season)
##  Model 2: ~ longitude + tt(longitude) + latitude + tt(latitude) + pressure + tt(pressure) + temperature + tt(temperature) + isWarm + tt(isWarm) + windSpeed + tt(windSpeed) + humidity + tt(humidity) + visibility + cloudHeight + strata(season)
##    loglik  Chisq Df Pr(>|Chi|)    
## 1 -1407.8                         
## 2 -1411.6 7.7764  0  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Termplot figures for model tt1 show slower values -> take this one as favorite. 
termplot(Mbest_tt1, se=TRUE)

termplot(Mbest_tt2, se=TRUE)

termplot(Mbest_tt3, se=TRUE)

Residuals of selected models

residuals_best <- residuals(Mbest    , type="martingale")
residuals_tt1  <- residuals(Mbest_tt1, type="martingale")
residuals_tt2  <- residuals(Mbest_tt2, type="martingale")
residuals_tt3  <- residuals(Mbest_tt3, type="martingale")
par(mfrow = c(1, 4), mar = c(4, 4, 3, 1))

plot(residuals_best, ylim=c(-6,1), main="BEST")
plot(residuals_tt1 , ylim=c(-6,1), main="TT1")
plot(residuals_tt2 , ylim=c(-6,1), main="TT2")
plot(residuals_tt3 , ylim=c(-6,1), main="TT3")

# Results are similare for differents time-dependent function...

par(mfrow = c(1,1), mar = c(3, 4, 3, 1))
# residuals for NONE is far from 0 (=1) while BEST and ALL is closer to 0.
# Many outliers -> try to identify them

boxplot(residuals_best, residuals_tt1, names=c("BEST", "TT1"))

boxplot(residuals_tt1, residuals_tt2, residuals_tt3, names=c("TT1", "TT2", "TT3"))

par(mfrow = c(2,3), mar = c(3, 4, 3, 1))

hist(residuals_best, xlim=c(-4, 1), breaks=20, main="Best model")
hist(residuals_tt1 , xlim=c(-4, 1), breaks=6, main="Time-varying best model")
boxplot(residuals_best, residuals_tt1, names=c("Best", "Time-varying"))

predict_best <- predict(Mbest   , type="lp")
predict_tt   <- predict(Mbest_tt1, type = "lp")

par(mfrow = c(1, 2), mar = c(4, 4, 3, 1))

plot(predict_best, residuals_best, ylim=c(-2,1), xlab="Linear predictor", ylab="Martingale residuals", main="Best model")
lines(lowess(predict_best, residuals_best), col="red", lwd=2)
abline(h=0, lty=2)

plot(predict_tt  , residuals_tt1 , ylim=c(-2,1), xlab="Linear predictor", ylab="Martingale residuals", main="Time-varying best model")
lines(lowess(predict_tt, residuals_tt1), col="red", lwd=2)
abline(h=0, lty=2)

# Residuals close to +1 indicate subjects who experienced the event but the model predicted a very low hazard (possibly early events or unusual cases).
# This could happen if the model underfits or the time-dependent term hasn't fully captured hazard variation.
# - Residuals near +1 = events with underestimated risk by the model.
# - Residuals near 0 = well-predicted outcomes.
# - Residuals near -1 = censored observations with large predicted hazard (over-predicted risk).